setwd("") # script reads data from & produces output in this directory
# two regression tables are generated in the working directory, 10 plots inside the plots pane (rstudio)

#________________________________________________________________libraries.
library(cjoint) # for amce estimator plots
### note: cjoint::amce() includes treatment as "respondent-varying" covariate
### and presents conjoint results by treatment group only. this approach does not give
### an ATE estimate explicitly, we thus also build our own OLS models with clustered SE to get these
library(lmtest) # clustered standard errors for OLS models
library(texreg) # generate regression tables

library(ggplot2)
library(stringr)
library(reshape2)

#________________________________________________________________set plot theme.
theme <- theme(
  legend.position = "none",

  panel.background = element_rect(fill = "gray90", color = "gray90"),
  strip.text = element_text(size = 8),
  strip.background = element_rect(colour = "white"),

  panel.grid.major = element_line(colour = "white", linetype = 1),
  panel.grid.minor = element_blank(),

  plot.title = element_text(face = "bold", size = 8, hjust = 0.5),

  axis.text.x = element_text(size = 8, margin = unit(c(t = 2.5, r = 0, b = 0, l = 0), "mm"), hjust = 0.5, vjust = 1),
  axis.title.x = element_text(size=8, hjust = 0.5),

  axis.text.y = element_text(size = 8, margin = unit(c(t = 0, r = 2.5, b = 0, l = 0), "mm"), hjust = 0, vjust = 0.5),
  axis.title.y = element_text(size = 8, vjust = 0.01, hjust = 0.1),

  plot.caption = element_text(size=8),
)


#________________________________________________________________STUDY 1: data setup
file <- "May+2020+Public+Affairs+Survey+V2_May+24,+2020_10.36 - Copy.csv"

# CREATE WIDE DATA SET. read data dealing with Qualtrics' "triple header"

# store first row of raw data set as vector of variable names
colnames <- read.csv(file, header = FALSE, nrows = 1, as.is = TRUE)

# read rest of data starting with row number 4
# we call this the wide data format (useful for recoding and exploratory analysis)
wide <- read.csv(file, skip = 3, header = FALSE, stringsAsFactors = FALSE)

# attach variable names saved earlier to rest of data
colnames(wide) <- colnames

# discard preview (non-Dynata) and screened out respondents 
# screened-out respondents did not comply with Dynata's quotas

# note: the date string, based on which we filter preview respondents,
# may be read with different default formatting on different computers.
# ours looks like this: 06/08/2020 10:35 BST. if filtering below (row 69) produces an empty dataframe,
# then this formatting needs to be changed, e.g. "%Y-%m-%d %H:%M:%S".
# confirm formatting with head(wide$StartDate)
BST <- "%d/%m/%Y %H:%M"
wide$StartDate <- strptime(wide$StartDate, format = BST)

# cutoff time: the survey was not live before that
cutoff <- strptime("20/05/2020 10:35:00 BST", format = BST)

# execute subsetting based on cutoff date:
wide <- wide[wide$StartDate > cutoff,]

# missing data on demographics means respondent was screened out
# next line removes these
wide <- wide[!is.na(wide$Q2),]

# LONG DATA SET: this is shaped via package 'cjoint'
# we use the long data format to build models where the unit of analysis is
# the individual app profile evaluated, clustered within trials (pairing) and respondents
long <- read.qualtrics(
  file,
  responses = paste0('Q',c(23:27)),
  respondentID = 'ResponseId', 
  covariates = c('StartDate','ctrace_pre_stimulus_DO'),
  new.format = TRUE,
  letter = 'F'
)

# same cutoff process as above rows 52 to 71.
long$StartDate <- strptime(long$StartDate, format = BST)
long <- long[long$StartDate > cutoff,]

# clean up attribute names and empty factor levels produced by qualtrics
long <- long[long$Contacts.uploaded != "" & long$Location.uploaded != "" & long$Purpose.of.app != "",]

long$Data.storage <- str_replace(long$Data.storage, "\\.", "")
long$Data.storage <- factor(long$Data.storage)
long$Contacts.uploaded <- factor(long$Contacts.uploaded)
long$Location.uploaded <- factor(long$Location.uploaded)
long$Purpose.of.app <- factor(long$Purpose.of.app)

# discard all missing data.
long <- na.omit(long)

# recode treatment variable (which data breach stimulus was displayed?)
long$treatment <- as.factor(
  ifelse(
    long$ctrace_pre_stimulus_DO == "Q22", "Treated", "Control"
  )
)

# reorder attribute levels of data storage
long$Data.stored.until <- factor(
  long$Data.stored.until, levels = 
    c("Not stored","Data kept indefinitely",
      "Covid-19 tests are widely available",
      "An effective vaccine for Covid-19 is widely available")
)

# reorder attribute levels of contacts uploaded
long$Contacts.uploaded <- factor(
  long$Contacts.uploaded, levels = 
    c("None","Contact with a person having symptoms","All contacts regardless of symptoms")
)

# to long data, add moderators (covariates) from wide data: nhs and trust in gov't
#
# (1) trust in nhs q6_1
long <- merge(
  long,
  wide[,c("ResponseId","Q6_1")],
  by.x = "Response.ID", by.y = "ResponseId",
  all.x = TRUE
)

# (2) trust in gov't q8
wide$Q8[wide$Q8 == 5] <- NA

long <- merge(
  long,
  wide[,c("ResponseId","Q8")],
  by.x = "Response.ID", by.y = "ResponseId",
  all.x = TRUE
)

# informative variable names and factor levels:
long$trustNHS <- as.factor(
  ifelse(
    long$Q6_1 < mean(long$Q6_1), "Low", "High"
  )
)

long$govPerform <- as.factor(
  ifelse(
    long$Q8 %in% 1:2, "Bad", "Good"
  )
)

# subset data "centralised systems only," this is dataset long2
long2 <- long[long$Data.storage!="Decentralised, locally at device",]

# remove empty factor levels after subsetting
long2$Data.storage <- factor(long2$Data.storage)
long2$Data.stored.until <- factor(long2$Data.stored.until)

# store attribute names for later use
anames <- names(long)[c(9,11,13,19,24)]

a<- list()

for (i in anames){
  a[[i]] <- levels(long[,i])
}

#______________________________________________________________Study 1: Amce plots
# data storage plot
amce1 <- amce(
  respondent.id = 'respondent', 
  formula = selected ~ treatment * Data.storage,
  data = long,
  respondent.varying = "treatment"
)

plot(amce1,
     main = "Preferences across data storage systems\n(dependent attributes)",
     plot.display = "interaction",
     plot.theme = theme,
     attribute.names = "Data storage",
     colors = c("orangered"),
     xlim = c(-0.12,+0.12),
)

# data privacy attributes plot
amce2 <- amce(
  respondent.id = 'respondent', 
  formula = selected ~ treatment * Data.storage + treatment * Data.stored.until + treatment * Location.uploaded + treatment * Contacts.uploaded + treatment * What.constitutes.a.contact ,
  data = long2,
  respondent.varying = "treatment"
)

attributes <- c(
  "Data storage",
  "Data stored until",
  "Location uploaded",
  "Contacts uploaded",
  "What constitutes contact"
)

plot(amce2,
     main = "Privacy peferences within centralised systems",
     plot.display = "interaction",
     plot.theme = theme,
     attribute.names = attributes,
     colors = c("black","orangered"),
     xlim = c(-0.22,+0.22),
)

#______________________________________________________________Study 1: OLS MODELS with clustered errors
covariates <- c("treatment", "Q6_1", "Q8")

models <- list()

# fit models
for (i in covariates){
  
  models[[i]] <- glm(
    selected ~ get(i) * Data.storage,
    data = long,
    family = binomial(link = "logit")
  )
}

# clustered standard errors:
se <- list()
pval <- list()

for (i in 1:3){
  
  temp <- coeftest(models[[i]],vcov = vcovHC(models[[i]], type = 'HC2', cluster = "respondent"))
  
  se[[i]] <- temp[,2]
  pval[[i]] <- temp[,4]
}


# cleaning up coefficient names for better display in tables
names(models[[1]]$coefficients) <- str_replace(
  names(models[[1]]$coefficients), "Treated",""
)

# more cleaning up coefficient names for better display in tables
coefPretty <- str_replace(names(models[[1]]$coefficients), "get\\(i\\)","Covariate")
coefPretty <- str_replace(coefPretty, "Covariate:","Covariate x ")
coefPretty[1] <- "Intercept"
coefPretty <- str_replace(coefPretty, paste(anames, collapse = "|"),"")

# regression table column headings
modPretty <- c("Treatment","High trust in NHS","Government handles pandemic well")

htmlreg(models, 
          override.se = se, 
          override.pvalues = pval,
          custom.model.names = modPretty, 
          custom.coef.names = coefPretty,
          "data_storage_models.doc"
)

# new models for data privacy attributes on subset of data, same process as above
models <- list()

for (i in covariates){
  
  models[[i]] <- glm(
    selected ~ get(i) * Data.storage + 
      get(i) * Data.stored.until + 
      get(i) * Location.uploaded + 
      get(i) * Contacts.uploaded +
      get(i) * What.constitutes.a.contact,
    
    data = long2,
    
    family = binomial(link = "logit")
    
  )
}

# clustered standard errors:
se <- list()
pval <- list()

for (i in 1:3){
  
  temp <- coeftest(models[[i]],vcov = vcovHC(models[[i]], type = 'HC2', cluster = "respondent"))
  
  se[[i]] <- temp[,2]
  pval[[i]] <- temp[,4]
}


names(models[[1]]$coefficients) <- str_replace(
  names(models[[1]]$coefficients), "Treated",""
)

coefPretty <- str_replace(names(models[[1]]$coefficients), "get\\(i\\)","Covariate")
coefPretty <- str_replace(coefPretty, "Covariate:","Covariate x ")
coefPretty[1] <- "Intercept"
coefPretty <- str_replace(coefPretty, paste(anames, collapse = "|"),"")

modPretty <- c("Treatment","High trust in NHS","Government handles pandemic well")

htmlreg(models, 
          override.se = se, 
          override.pvalues = pval,
          custom.model.names = modPretty, 
          custom.coef.names = coefPretty,
          "privacy_attribute_models.doc"
)

#______________________________________________________Study 1 appendix.
wide$gender <- ifelse(
  wide$Q2 == 1, "male", ifelse(
    wide$Q2 == 2, "female", "other"
  )
)

wide$location <- ifelse(
  wide$Q4 == 1, "EE", ifelse(
    wide$Q4 == 2, "EM", ifelse(
      wide$Q4 == 3, "Lon", ifelse(
        wide$Q4 == 4, "NE", ifelse(
          wide$Q4 == 5, "NW", ifelse(
            wide$Q4 == 6, "NI", ifelse(
              wide$Q4 == 7, "Scot", ifelse(
                wide$Q4 == 8, "SE", ifelse(
                  wide$Q4 == 9, "SW", ifelse(
                    wide$Q4 == 10, "Wales", ifelse(
                      wide$Q4 == 11, "WM", "YH"
                    ))
                )
              )
            )
          )
        )
      )
    )
  )
)

wide$age <- ifelse(
  (2020 - wide$Q3) %in% 18:24, "18--24", ifelse(
    (2020 - wide$Q3) %in% 25:34, "25--34", ifelse(
      (2020 - wide$Q3) %in% 35:44, "35--44", ifelse(
        (2020 - wide$Q3) %in% 45:54, "45--54", ifelse(
          (2020 - wide$Q3) %in% 55:64, "55--64", "65+"
        )
      )
    )
  )
)

# three descriptive plots: Appendix A1 left hand side
for (varstring in c("location","gender","age")) {
  print(
    ggplot(wide, aes(x = get(varstring), y = ..prop.., group = 1)) + 
      geom_bar(stat = "count",width = .65, fill = "gray21") +
      xlab("") +
      ylab("Proportion") +
      ggtitle(paste("Study 1",varstring)) +
      theme_bw() + 
      theme
    )
}

#____________attention checks: percentages
# Berinsky et al 2014 prop those who did not click 'neither agree or disagree'
prop.table(table(wide$Q18_4!=3))

# those incorrect on 2 > 1 
prop.table(table(!wide$Q42_4 %in% 4:5))

# results robust to data quality plot Fig. A2
high_quality <- wide$ResponseId[(wide$Q42_4 %in% 4:5) & (wide$Q18_4==3)]

amceX1 <- amce(
  respondent.id = 'respondent', 
  formula = selected ~ treatment * Data.storage,
  data = long[long$Response.ID %in% high_quality,],
  respondent.varying = "treatment"
)

plot(amceX1,
     main = "Preferences across data storage systems\n(dependent attributes)",
     plot.display = "interaction",
     plot.theme = theme,
     attribute.names = "High quality responses only: Data storage",
     colors = c("orangered"),
     xlim = c(-0.12,+0.12),
)


longX2 <- long[long$Data.storage!="Decentralised, locally at device",]
longX2$Data.storage <- factor(longX2$Data.storage)
longX2$Data.stored.until <- factor(longX2$Data.stored.until)

amceX2 <- amce(
  respondent.id = 'respondent', 
  formula = selected ~ treatment * Data.storage + treatment * Data.stored.until + treatment * Location.uploaded + treatment * Contacts.uploaded + treatment * What.constitutes.a.contact ,
  data = long2[long2$Response.ID %in% high_quality,],
  respondent.varying = "treatment"
)

attributes <- c(
  "Data storage",
  "Data stored until",
  "Location uploaded",
  "Contacts uploaded",
  "What constitutes contact"
)

plot(amceX2,
     main = "High quality responses only: Centralised systems",
     plot.display = "interaction",
     plot.theme = theme,
     attribute.names = attributes,
     colors = c("black","orangered"),
     xlim = c(-0.22,+0.22),
)

#__________________________________appendix B: treatment compliance
summary(
  table(
    # those who recall term "theft of personal data"
    grepl("1",wide$Q28),
    # those in treatment
    wide$ctrace_pre_stimulus_DO
  )
)

# randomisation results
prop.table(table(wide$ctrace_pre_stimulus_DO))

# display frequency of conjoint attributes 1
ggplot(data = long, aes(x = Data.storage, y = ..prop.., group = 1)) + 
  geom_bar() + 
  ggtitle("Diaplsy freq. of conjoint attributes: Data storage") +
  coord_flip() + 
  ylab("") +
  xlab("") +
  theme

# display frequency of conjoint attributes: rest of attributes within centralised systems only
long2$id <- row.names(long2)

options(warn=-1)
disp <- melt(
  long2,
  measure.vars = c("Data.stored.until","Contacts.uploaded","Location.uploaded","What.constitutes.a.contact"),
  id.vars = c("task","respondent","profile")
)

disp$variable <- as.character(disp$variable)
disp$variable[disp$variable=="What.constitutes.a.contact"] <- "What const. contact"

ggplot(data = disp, aes(x = value, y = ..prop.., group = 1)) + 
  geom_bar() + 
  ggtitle("Display freq. of conjoint attributes within centralised systems") +
  coord_flip() + 
  facet_grid(variable ~ ., scales = "free") +
  ylab("Proportion") +
  xlab("") +
  theme + 
  theme(
    axis.text.y = element_text(hjust = 1)
  )
  
#__________________appendix c: observed distribution of DV
# C1: unconditional on moderators
# populating dev: the data frame where each level's frequency is displayed against hypothetical random display
dev <- data.frame(
  "level"  = c(unlist(a)),
  "selected" = NA,
  "presented" = NA
)

# cleaning up attribute names
dev$attribute <- stringr::str_replace(rownames(dev),"[1234]", "")
rownames(dev) <- 1:nrow(dev)

# populating dev in a loop: calculate each attribute selected/displayed to get preference in %
attach(long)
for (i in 1:nrow(dev)){
  
  temp_presented <- table(get(dev$attribute[i]))
  temp_selected <- table(get(dev$attribute[i])[selected==1])
  
  col <- which(names(temp_presented)==dev$level[i])
  
  dev$presented[i] <- temp_presented[col]
  dev$selected[i] <- temp_selected[col]
  
}
detach(long)

# further cleaning of attribute names
dev$attribute <- stringr::str_replace_all(dev$attribute,"\\.", " ")

# calculate proportion
dev$proportion <- dev$selected / dev$presented

# calculate deviation from random choice (if attributes were selected 50/50)
dev$deviate <- dev$proportion - 0.5

# calculate confidence intervals around proportions
dev$lwr <- NULL
dev$upr <- NULL

for (i in 1:nrow(dev)){
  dev$lwr[i] <- prop.test(dev$selected[i],dev$presented[i])$conf.int[1] - 0.5
  dev$upr[i] <- prop.test(dev$selected[i],dev$presented[i])$conf.int[2] - 0.5
}

# for better display in plot
dev$level2 <- c("None","Symptoms","All",
                "Device only","Gov't","NHS",
                "Not stored","Indefinitely","Tests available","Vaccine available",
                "Exact","Postal area","None",
                "6 ft/15 mins","6 ft/5 mins","12 ft/15 mins","12 ft/5 mins")

# for better display in plot
dev$attribute[dev$attribute=="What constitutes a contact"] <- "What constitutes contact"
dev$attribute <- factor(dev$attribute, levels = c("Data storage", "Data stored until", "Location uploaded", "Contacts uploaded", "What constitutes contact"))

ggplot(dev, aes(y = deviate, x = level2)) + 
  geom_bar(stat = "identity", width = .65, position = position_dodge(width = .7), fill = "grey25") +
  facet_grid( . ~ attribute, scales = "free", space = "free") +
  ylab("Per cent deviation from random choice\n(50/50 when comparing two apps)") + 
  xlab("") + 
  ylim(c(-0.15,0.15)) +
  labs(title = "Respondents' data privacy and data security preferences regarding contact tracing applications") +
  theme + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# C2, C3, C4: conditional on treatment and two moderators
# follows same steps as C1 see annotation above
# all three plots generated in this loop
variables <- c("Treatment","NHS","Gov't performance")
NHSmean <- mean(long$Q6_1)

for (variable in variables){
  
  # first generate descriptive data for control group/low trust/bad performance (baseline categories)
  dev <- data.frame(
    "level"  = c(unlist(a)),
    "selected" = NA,
    "presented" = NA
  )
  
  dev$attribute <- stringr::str_replace(rownames(dev),"[1234]", "")
  rownames(dev) <- 1:nrow(dev)
  
  if (variable == "Treatment"){
    control <- long[long$treatment == "Control",]
  } else if (variable == "NHS"){
    control <- long[long$Q6_1 < NHSmean,]
  } else {
    control <- long[long$Q8 < 3,]
  }
  
  attach(control)
  for (i in 1:nrow(dev)){
    
    temp_presented <- table(get(dev$attribute[i]))
    temp_selected <- table(get(dev$attribute[i])[selected==1])
    
    col <- which(names(temp_presented)==dev$level[i])
    
    dev$presented[i] <- temp_presented[col]
    dev$selected[i] <- temp_selected[col]
    
  }
  detach(control)
  
  dev$attribute <- stringr::str_replace_all(dev$attribute,"\\.", " ")
  dev$proportion <- dev$selected / dev$presented
  dev$deviate <- dev$proportion - 0.5
  
  dev$lwr <- NULL
  dev$upr <- NULL
  
  for (i in 1:nrow(dev)){
    dev$lwr[i] <- prop.test(dev$selected[i],dev$presented[i])$conf.int[1] - 0.5
    dev$upr[i] <- prop.test(dev$selected[i],dev$presented[i])$conf.int[2] - 0.5
  }
  
  dev$level2 <- c("None","Symptoms","All",
                  "Device only","Gov't","NHS",
                  "Not stored","Indefinitely","Tests available","Vaccine available",
                  "Exact","Postal area","None",
                  "6 ft/15 mins","6 ft/5 mins","12 ft/15 mins","12 ft/5 mins")
  
  control <- dev
  
  if (variable == "Treatment"){
    control$treat <- "Control group"
  } else if (variable == "NHS"){
    control$treat <- "Low trust in NHS"
  } else {
    control$treat <- "UK Gov't handles pandemic badly"
  }
  
  # next generate descriptive statistics for the comparison groups: treatment, high trust in NHS, or good performance
  dev <- data.frame(
    "level"  = c(unlist(a)),
    "selected" = NA,
    "presented" = NA
  )
  
  dev$attribute <- stringr::str_replace(rownames(dev),"[1234]", "")
  rownames(dev) <- 1:nrow(dev)
  
  if (variable == "Treatment"){
    treated <- long[long$treatment == "Treated",]
  } else if (variable == "NHS"){
    treated <- long[long$Q6_1 >= NHSmean,]
  } else {
    treated <- long[long$Q8 >= 3,]
  }
  
  attach(treated)
  for (i in 1:nrow(dev)){
    
    temp_presented <- table(get(dev$attribute[i]))
    temp_selected <- table(get(dev$attribute[i])[selected==1])
    
    col <- which(names(temp_presented)==dev$level[i])
    
    dev$presented[i] <- temp_presented[col]
    dev$selected[i] <- temp_selected[col]
    
  }
  detach(treated)
  
  dev$attribute <- stringr::str_replace_all(dev$attribute,"\\.", " ")
  dev$proportion <- dev$selected / dev$presented
  dev$deviate <- dev$proportion - 0.5
  
  dev$lwr <- NULL
  dev$upr <- NULL
  
  for (i in 1:nrow(dev)){
    dev$lwr[i] <- prop.test(dev$selected[i],dev$presented[i])$conf.int[1] - 0.5
    dev$upr[i] <- prop.test(dev$selected[i],dev$presented[i])$conf.int[2] - 0.5
  }
  
  dev$level2 <- c("None","Symptoms","All",
                  "Device only","Gov't","NHS",
                  "Not stored","Indefinitely","Tests available","Vaccine available",
                  "Exact","Postal area","None",
                  "6 ft/15 mins","6 ft/5 mins","12 ft/15 mins","12 ft/5 mins")
  
  treated <- dev
  
  if (variable == "Treatment"){
    treated$treat <- "Treated"
  } else if (variable == "NHS"){
    treated$treat <- "High trust in NHS"
  } else {
    treated$treat <- "UK Gov't handles pandemic well"
  }
  
  dev <- rbind(control,treated)
  dev$attribute <- factor(dev$attribute, levels = c("Data storage", "Data stored until", "Location uploaded", "Contacts uploaded", "What constitutes a contact"))
  
  print(
    
    ggplot(dev, aes(y = deviate, x = level2, fill = treat)) + 
      geom_bar(stat = "identity", width = .7, position = position_dodge(width = .7)) +
      scale_fill_manual(values = c("gray75","gray45")) +
      geom_errorbar(aes(ymin = lwr, ymax = upr), width = .25, linetype = 1, color = "grey20", size = 0, position = position_dodge(width = .7)) +
      facet_grid( . ~ attribute, scales = "free", space = "free") +
      ylab("Per cent deviation from random choice\n(50/50 when comparing two apps)") + 
      xlab("") + 
      ylim(c(-0.15,0.15)) +
      labs(title = "Respondents' data privacy and data security preferences regarding contact tracing applications\n") +
      theme + 
      theme(
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "bottom",
        legend.title = element_blank()
      )
    
  )
  
}